This vignette documents the unsupervised analysis pipeline for RNA expression data.
expressions_file <- system.file("extdata/expressions.csv", package = "expr")
RNA_sample_info_file <- system.file("extdata/RNA_sample_info.csv",
package = "expr")
RNA_clinic_info_file <- system.file("extdata/RNA_clinic_info.csv",
package = "expr")
RNA_sample_info <- read.csv(RNA_sample_info_file, header = T,
stringsAsFactors = F, check.names = F)
RNA_clinic_info <- read.csv(RNA_clinic_info_file, header = T,
stringsAsFactors = F, check.names = F)
RNA_sample_info <- table_org(list(RNA_sample_info, RNA_clinic_info))
expressions <- read.csv(expressions_file, header = T, stringsAsFactors = F,
check.names = F)
load(system.file("extdata/protein_coding_ensemble2symbol.RData",
package = "expr"))
load(system.file("extdata/protein_coding_gene_info.RData", package = "expr"))
# set param
paramInput <- data.frame(sample_id_col = "Sample_ID", gene_id_col = "gene_id",
mark_library_size = T, adjusted_by_library_size = T, tolerant_library_size_factor = 1.1,
ens_id_col = "Gene_ID", symbol_col = "Gene_Symbol", mark_purity = T,
remove_low_purity_sample = T, low_purity_threshold = 0.3,
mark_lowexpressgene_pct = T, lowexpression_threshold = 1,
remove_sample_with_intense_lowexpressgene = T, outlier_lowexpressgene_pct_factor = 1.5,
remove_lowexpressgene = T, sample_frequency_threshold = 0.5)
# clean
tmp <- expression(prepare_clean_RNA_sample_info_and_protein_expressions(RNA_sample_info = RNA_sample_info,
sample_id_col = paramInput[["sample_id_col"]], expressions = expressions,
gene_id_col = paramInput[["gene_id_col"]], mark_library_size = paramInput[["mark_library_size"]],
adjusted_by_library_size = paramInput[["adjusted_by_library_size"]],
tolerant_library_size_factor = paramInput[["tolerant_library_size_factor"]],
protein_coding_ensemble2symbol_table = protein_coding_ensemble2symbol,
ens_id_col = paramInput[["ens_id_col"]], symbol_col = paramInput[["symbol_col"]],
mark_purity = paramInput[["mark_purity"]], remove_low_purity_sample = paramInput[["remove_low_purity_sample"]],
low_purity_threshold = paramInput[["low_purity_threshold"]],
mark_lowexpressgene_pct = paramInput[["mark_lowexpressgene_pct"]],
lowexpression_threshold = paramInput[["lowexpression_threshold"]],
remove_sample_with_intense_lowexpressgene = paramInput[["remove_sample_with_intense_lowexpressgene"]],
outlier_lowexpressgene_pct_factor = paramInput[["outlier_lowexpressgene_pct_factor"]],
remove_lowexpressgene = paramInput[["remove_lowexpressgene"]],
sample_frequency_threshold = paramInput[["sample_frequency_threshold"]]))
consoleOutput <- capture.output(results <- eval(tmp), type = c("output"))
# data loading summary
SumInfo <- t(data.frame(paramInput[, grep("factor|threshold",
colnames(paramInput))], Number_of_Cleaned_Samples = dim(results$clean_RNA_sample_info)[1],
Number_of_Cleaned_Genes = dim(results$clean_protein_expressions)[1],
Number_of_Deleted_Samples = dim(results$marked_ori_RNA_sample_info)[1] -
dim(results$clean_RNA_sample_info)[1], Notes = grep("too many low expression genes",
consoleOutput, value = T)))
t1 <- knitr::kable(SumInfo, caption = "Data loading summary") %>%
kableExtra::kable_styling(font_size = "200%")
# cleaned RNA sample info
t2 <- format(results$clean_RNA_sample_info, nsmall = 2) %>%
DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
"Sample Info (cleaned)"))
# log2 transformation
clean_RNA_sample_info <- results[["clean_RNA_sample_info"]]
clean_protein_expressions <- results[["clean_protein_expressions"]] # it is not log2 transformed yet
log2_clean_protein_expressions <- log2(clean_protein_expressions +
1)
# define class of sample attributes
coerce_ind <- names(which(sapply(clean_RNA_sample_info[grep("ID|MRN",
colnames(clean_RNA_sample_info))], is.numeric)))
clean_RNA_sample_info[, coerce_ind] <- as.character(clean_RNA_sample_info[,
coerce_ind])| tolerant_library_size_factor | 1.1 |
| low_purity_threshold | 0.3 |
| lowexpression_threshold | 1 |
| outlier_lowexpressgene_pct_factor | 1.5 |
| sample_frequency_threshold | 0.5 |
| Number_of_Cleaned_Samples | 59 |
| Number_of_Cleaned_Genes | 13756 |
| Number_of_Deleted_Samples | 1 |
| Notes | Sample Cap2115-6-ID04 has (have) too many low expression genes: 84.6735024284943 % respectively. |
In this step, genes with higher variance across data set and a fill rate greater than 0.75 are used for unsupervised analysis.
rm(list = setdiff(ls(), c(lsf.str(), grep("clean|protein_coding_gene_info",
ls(), value = T))))
# set param
paramInput <- data.frame(method = "MAD", mad_top_n = 1000, remove_outlier = F)
# prep data
unsupervised_data <- prepare_unsupervised_data(expressions = log2_clean_protein_expressions,
method = paramInput[["method"]], mad_top_n = paramInput[["mad_top_n"]],
remove_outlier = paramInput[["remove_outlier"]])
assertthat::are_equal(colnames(unsupervised_data), clean_RNA_sample_info[["Sample_ID"]])
#> [1] TRUE
# run analysis
unsupervised_results <- unsupervised_analysis(unsupervised_data,
labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
`2016-0861` = "blue", `LAB08-0380` = "red"))
# generate plots
p1 <- unsupervised_results$plots$pca$`3D` %>%
layout(title = "PCA", autosize = F)
p2 <- ggplotly(unsupervised_results$plots$umap + theme_bw() +
theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
"text")) %>%
layout(title = "UMAP")
p3 <- ggplotly(unsupervised_results$plots$tsne + theme_bw() +
theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
"text")) %>%
layout(title = "tSNE")
p4 <- ggplotly(unsupervised_results$plots$mds + theme_bw() +
theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
"text")) %>%
layout(title = "MDS")
p5 <- unsupervised_results$plots$heatmapa <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
yref = "paper", xanchor = "center", yanchor = "bottom", showarrow = FALSE)
annotations <- apply(a, 1, function(x) lapply(as.list(x), type.convert,
as.is = TRUE))
# grid.draw(cowplot::get_legend(unsupervised_results$plots$umap+theme(legend.position
# = 'top')))
p <- subplot(list(style(p1, showlegend = F), p2, p3, p4), nrows = 2,
margin = 0.1) %>%
layout(title = "Unsupervised clustering before correction",
annotations = annotations, scene = list(domain = list(x = c(0,
0.5), y = c(0.5, 1))))
browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
p)))cat("<br>")p5Call Batch effect “exist”, if k-means clustering match with batch definition, and that clusters are clearly separated ( evaluated by Silhouette score and withinss/betweenss). If batch_effect_exist==TRUE, proceed to correction; Otherwise, results in next section stays blank.
kmean.df <- (unsupervised_results$ress$umap$layout)
# calculate Silhouette score for k from 2 to 2 times number
# of batches and determine the optimal k for clustering
avg_sil <- function(k) {
km.res <- kmeans(kmean.df, centers = k, nstart = 25)
ss <- cluster::silhouette(km.res$cluster, dist(kmean.df))
mean(ss[, 3])
}
k.values <- 2:(2 * (length(unique(clean_RNA_sample_info$PROTOCOL))))
avg_sil_values <- sapply(k.values, avg_sil)
km.res <- kmeans(kmean.df, centers = k.values[which.max(avg_sil_values)],
nstart = 25)
# test if good clustering
good <- all(c(km.res$betweenss/km.res$totss > 0.5, max(avg_sil_values) >
0.1))
# test if k-means clustering match with batch definition
clusters <- lapply(unique(km.res$cluster), function(x) names(km.res$cluster)[km.res$cluster ==
x])
batches <- lapply(unique(clean_RNA_sample_info$PROTOCOL), function(x) clean_RNA_sample_info$Sample_ID[clean_RNA_sample_info$PROTOCOL ==
x])
hits <- c()
i <- 0
for (ncluster in 1:length(clusters)) {
for (nbatches in 1:length(batches)) {
hit <- length(setdiff(clusters[[ncluster]], batches[[nbatches]])) ==
0 & length(setdiff(batches[[nbatches]], clusters[[ncluster]])) ==
0
hits <- c(hits, hit)
}
}
matched <- any(hits) == T
batch_effect_exist <- all(matched, good)
cat(sprintf("Clustering good?: **%s**<br>
Clustering matched?: **%s**<br>
Batch effect exists: **%s**<br>",
good, matched, batch_effect_exist))Clustering good?: TRUE
Clustering matched?:
TRUE
Batch effect exists:
TRUE
If batch effect exist, attempt to use limma and
COMBAT to correct such effect
unsupervised_resultss <- list()
if (batch_effect_exist) {
# limma
limma_rbe_log2_protein_expressions <- limma::removeBatchEffect(log2_clean_protein_expressions,
batch = clean_RNA_sample_info$PROTOCOL)
uns_limma_rbe_log2_protein_expressions <- prepare_unsupervised_data(limma_rbe_log2_protein_expressions,
method = "MAD", mad_top_n = 1000)
unsupervised_resultss[["limma"]] <- unsupervised_analysis(uns_limma_rbe_log2_protein_expressions,
labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
`2016-0861` = "blue", `LAB08-0380` = "red"))
# combat
combat_rbe_log2_protein_expressions <- sva::ComBat(log2_clean_protein_expressions,
batch = clean_RNA_sample_info$PROTOCOL)
uns_combat_rbe_log2_protein_expressions <- prepare_unsupervised_data(combat_rbe_log2_protein_expressions,
method = "MAD", mad_top_n = 1000)
unsupervised_resultss[["combat"]] <- unsupervised_analysis(uns_combat_rbe_log2_protein_expressions,
labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
`2016-0861` = "blue", `LAB08-0380` = "red"))
# generate plots
p1 <- p2 <- p3 <- p4 <- p5 <- list()
for (method in c("limma", "combat")) {
unsupervised_results <- unsupervised_resultss[[method]]
p1[[method]] <- unsupervised_results$plots$pca$`3D` %>%
layout(title = sprintf("PCA %s", method), autosize = F)
p2[[method]] <- ggplotly(unsupervised_results$plots$umap +
theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x",
"y", "Label", "text")) %>%
layout(title = sprintf("UMAP %s", method))
p3[[method]] <- ggplotly(unsupervised_results$plots$tsne +
theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x",
"y", "Label", "text")) %>%
layout(title = sprintf("tSNE %s", method))
p4[[method]] <- ggplotly(unsupervised_results$plots$mds +
theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
colnames(unsupervised_data)))), tooltip = c("x",
"y", "Label", "text")) %>%
layout(title = sprintf("MDS %s", method))
p5[[method]] <- unsupervised_results$plots$heatmap
}
}if (batch_effect_exist) {
a <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
yref = "paper", xanchor = "center", yanchor = "bottom",
showarrow = FALSE)
annotations <- apply(a, 1, function(x) lapply(as.list(x),
type.convert, as.is = TRUE))
pp1 <- p1[["limma"]]
pp2 <- p2[["limma"]]
pp3 <- p3[["limma"]]
pp4 <- p4[["limma"]]
p <- subplot(list(style(pp1, showlegend = F), pp2, pp3, pp4),
nrows = 2, margin = 0.1) %>%
layout(title = "Unsupervised clustering after correction",
annotations = annotations, scene = list(domain = list(x = c(0,
0.5), y = c(0.5, 1))))
browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
p)))
}if (batch_effect_exist) {
cat("<br>")
p5[["limma"]]
}if (batch_effect_exist) {
a <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
yref = "paper", xanchor = "center", yanchor = "bottom",
showarrow = FALSE)
annotations <- apply(a, 1, function(x) lapply(as.list(x),
type.convert, as.is = TRUE))
pp1 <- p1[["combat"]]
pp2 <- p2[["combat"]]
pp3 <- p3[["combat"]]
pp4 <- p4[["combat"]]
p <- subplot(list(style(pp1, showlegend = F), pp2, pp3, pp4),
nrows = 2, margin = 0.1) %>%
layout(title = "Unsupervised clustering after correction",
annotations = annotations, scene = list(domain = list(x = c(0,
0.5), y = c(0.5, 1))))
browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
p)))
}if (batch_effect_exist) {
cat("<br>")
p5[["combat"]]
}
#> <br>If batch effect correction was done, use post-batch-effect-removal expression matrix for downstream analysis, eg. unsupervised clustering. Below, limma-corrected expressions were used.
if (batch_effect_exist) {
log2_expressions <- limma_rbe_log2_protein_expressions
} else {
log2_expressions <- log2_clean_protein_expressions
}
# clear environment
rm(list = setdiff(ls(), c(grep("log2_expressions|info", ls(),
value = T))))
# save(clean_RNA_sample_info,log2_expressions,file =
# 'data/woodman.RData')
unsupervised_expressions <- prepare_unsupervised_data(log2_expressions,
method = "MAD", mad_top_n = 1000)
heatmap_mRNA <- draw(Heatmap(t(scale(t(unsupervised_expressions))),
name = "expression\nz_score", top_annotation = getHeatMapAnnotation(sampleAttr = clean_RNA_sample_info,
data_mx = log2_expressions, essentialOnly = T), show_row_names = F,
column_dend_reorder = T, show_column_names = F, column_names_gp = gpar(fontsize = 1)))comp_limma <- dge_limma(log2_expressions, is_rawcount = F, is_logged = T,
normalize = F, sample_frequency_threshold = 0.5, clinic_info = clean_RNA_sample_info,
ID_col = "Sample_ID", group_col = "APOLLO_TIMPOINT", contrasts = c("TP2-Baseline"),
method = "limma_trend")
#> non raw_count data will be analyzed with limma
stats <- cbind(comp_limma$statistics, protein_coding_gene_info[match(rownames(comp_limma$statistics),
protein_coding_gene_info$Gene_name), -1])
stats <- stats[order(stats[["TP2-Baseline:P.Value"]]), ]
format(stats[stats[["TP2-Baseline:P.Value"]] < 0.05, ], nsmall = 2) %>%
DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
"DGE: baseline VS TP2"))
paired_RNA_sample_info <- clean_RNA_sample_info[table(clean_RNA_sample_info$MRN) >
1, ]
# paired_RNA_sample_info<-clean_RNA_clinic[match(paired_RNA_sample_info$Sample_ID,clean_RNA_clinic$Sample_ID),]
paired_log2_expressions <- log2_expressions[, paired_RNA_sample_info$Sample_ID]
assertthat::are_equal(colnames(paired_log2_expressions), paired_RNA_sample_info$Sample_ID)
#> [1] TRUE
paired_comp_limma <- dge_limma(paired_log2_expressions, is_rawcount = F,
is_logged = T, normalize = F, sample_frequency_threshold = 0.5,
clinic_info = paired_RNA_sample_info, ID_col = "Sample_ID",
group_col = "APOLLO_TIMPOINT", contrasts = c("TP2-Baseline"),
method = "limma_trend")
#> non raw_count data will be analyzed with limma
stats <- cbind(paired_comp_limma$statistics, protein_coding_gene_info[match(rownames(paired_comp_limma$statistics),
protein_coding_gene_info$Gene_name), -1])
stats <- stats[order(stats[["TP2-Baseline:P.Value"]]), ]
format(stats[stats[["TP2-Baseline:P.Value"]] < 0.05, ], nsmall = 2) %>%
DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
"DGE: paired baseline VS TP2"))